---
title: "Make figures for The Great Leveler? Juvenile Arrest, College Attainment, and the Future of American Inequality"
author: "Garrett Baker (garrett.baker@duke.edu)"
knit: (function(inputFile, encoding) {
  rmarkdown::render(inputFile, encoding = encoding, output_dir = "filepath") })
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
# Set the default chunk option so every chunk's source code is echoed in the
# knitted document.
knitr::opts_chunk$set(echo = TRUE)
```


```{r}
library(tidyverse)
library(haven)
library(hrbrthemes)
```


## FIGURE 1. COLLEGE GRADUATION BY GROUP

```{r}
# Load and wrangle data
# Keep respondents with non-missing arrest and college indicators, drop
# sp_ethn_aw code 4, and restrict to cohorts 0 and 9.
d_desc1 <- read_dta("filepath/arrest_educ_cleandata.dta") %>% 
  filter(!is.na(arrestedbefore_19),
         !is.na(w4w5_college4yr),
         sp_ethn_aw != 4,
         cohort_numeric == 0 | cohort_numeric == 9) %>% 
  dplyr::select(arrestedbefore_19, 
         arrested_19_cat,
         w4w5_college4yr,
         sp_ethn_aw,
         sex,
         cohort_numeric)

# Recode the numeric codes into named factors, then stack the three
# demographic columns (gender, race, cohort) into long format so each
# respondent contributes one row per demographic dimension.
# NOTE(review): the TRUE fallbacks label any code not listed explicitly
# (including a missing `sex`) as "Female" / "OtherRace" / "Cohort9" / "yes";
# confirm the inputs are fully coded upstream.
d_desc2 <- d_desc1 %>% 
  transmute(gender = factor(case_when(sex == 1 ~ "Male",
                                      TRUE ~ "Female")),
            race = factor(case_when(
              sp_ethn_aw == 1 ~ "White",
              sp_ethn_aw == 2 ~ "Black",
              sp_ethn_aw == 3 ~ "Hispanic",
              TRUE ~ "OtherRace"
            )),
            cohort = factor(case_when(cohort_numeric == 0 ~ "Cohort0",
                                      TRUE ~ "Cohort9")),
            college = factor(case_when(w4w5_college4yr == 0 ~ "no",
                                       TRUE ~ "yes")),
            arrest_cat = factor(arrested_19_cat),
            arrest_19 = factor(arrestedbefore_19)
            ) %>% 
  # Select the demographic columns by name rather than by position so a
  # reordering of the transmute() above cannot silently pivot the wrong ones.
  pivot_longer(c(gender, race, cohort),
               names_to = "group",
               values_to = "levels")

# Create working dataframe
# For every demographic level x arrest status, compute the percent in each
# college category together with its sampling uncertainty.
d_desc3 <- d_desc2 %>% 
  group_by(group, levels, arrest_19, college) %>% 
  # "drop_last" keeps the grouping by group/levels/arrest_19, so sum(n) below
  # is the number of respondents in that demographic-by-arrest cell.
  summarize(n = n(), .groups = "drop_last") %>% 
  mutate(n_cell = sum(n),
         prop = n / n_cell,
         percent = prop * 100,
         # Binomial (Wald) standard error of a proportion, sqrt(p * (1 - p) / N),
         # rescaled to percentage points so it matches `percent`.
         se = sqrt(prop * (1 - prop) / n_cell) * 100,
         ci_upper = percent + 1.96 * se, ## upper
         ci_lower = percent - 1.96 * se  ## lower
         ) %>% 
  # Ungroup so later operations on d_desc3 are not silently applied per group.
  ungroup() %>% 
  dplyr::select(-n, -n_cell, -prop)

# Create labels
facet_labels <- c("0" = "Never Arrested", "1" = "Arrested Before 19", "2" = "Arrested After 22.5")
demog_labels <- c("White", "Black", "Hispanic", "OtherRace", "Male", "Female", "Cohort0", "Cohort9")

# Create new columns to use
# Ordered versions of the demographic level (forward and reversed, for the
# axis) and of the demographic dimension, plus a rounded percent.
d_desc3 <- d_desc3 %>% 
  mutate(demog_ordered = factor(levels, levels = demog_labels),
         demog_ordered_rev = factor(levels, levels = rev(demog_labels)),
         group_ordered = factor(group, levels = c("race", "gender", "cohort")),
         percent_rounded = round(percent, digits = 1))

```


```{r}
# Make plot
# Figure 1: horizontal dodged bars of the percent with a four-year college
# degree ("yes" rows only), by demographic level and arrest status. The
# "OtherRace" level is left out of the figure.

p1 <- d_desc3 %>%
  filter(college == "yes",
         levels != "OtherRace") %>% 
  ggplot(aes(x = percent, y = demog_ordered_rev, fill = arrest_19)) +
  geom_col(orientation = "y",
           width = 0.9,
           position = "dodge",
           alpha = 0.8,
           color = "black",
           # Bar outline width. ggplot2 >= 3.4.0 uses `linewidth` for line
           # widths (`size` is deprecated for this), matching the other
           # figures in this file.
           linewidth = .9) +
  scale_fill_manual(values = c("0" = "gray70",
                                "1" = "black"),
                    labels = c("0" = "Never Arrested + Later Arrested",
                               "1" = "Arrested by 19")) +
  labs(y = NULL,
       x = "Percent") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        legend.box.spacing = unit(0, "pt"),
        plot.title = element_text(face = "bold"))

print(p1)
# No width/height is given, so ggsave() falls back to the size of the current
# graphics device.
ggsave("filepath/figure1.png", p1)
```












## FIGURE 2. HETEROGENEITY BY DEMOGRAPHICS

```{r}
# Normal critical values for two-sided 90% and 95% intervals.
# For R&R we're adding 90% CIs
z90 <- qnorm(0.95)
z95 <- qnorm(0.975)

# Display order of the demographic subgroup models.
order <- c("Cohort9", "Cohort0", "Female", "Male", "White", "Hispanic", "Black")

# Read the subgroup estimates, keep only the rows whose `var` mentions
# arrestedbefore_19, and build interval bounds from `coef` and `stderr`.
d_demog <- read_dta("filepath/heterogeneity_plot.dta") %>%
  filter(grepl("arrestedbefore_19", var)) %>%
  mutate(
    ci_90L = coef - stderr * z90,
    ci_90u = coef + stderr * z90,
    # The two 95% bounds should duplicate ci_lower / ci_upper from the file.
    ci_95l = coef - stderr * z95,
    ci_95u = coef + stderr * z95
  )
```



```{r, fig.width=7}
# Figure 2: one point per demographic subgroup model at its coefficient, with
# a thick bar for the 90% bounds and a thin, lighter bar for ci_lower/ci_upper
# (presumably the 95% bounds supplied in the input file).

# Tag each model with the facet it belongs to, then fix the order of the
# facets and of the models along the y axis.
d_demog <- d_demog %>%
  mutate(
    group = case_when(
      model %in% c("Cohort9", "Cohort0") ~ "Cohort",
      model %in% c("Female", "Male") ~ "Gender",
      model %in% c("Black", "Hispanic", "White") ~ "Race",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    group = factor(group, levels = c("Race", "Gender", "Cohort")),
    model = factor(
      model,
      levels = c("White", "Hispanic", "Black", "Male", "Female", "Cohort9", "Cohort0")
    )
  )

p2 <- d_demog %>%
  ggplot(aes(y = model, x = coef, shape = model, xmin = ci_lower, xmax = ci_upper)) +
  # Thick bar: 90% bounds.
  geom_errorbarh(
    aes(xmin = ci_90L, xmax = ci_90u),
    position = position_dodge(.5), linewidth = 2, height = 0
  ) +
  # Thin bar: bounds carried in the input file.
  geom_errorbarh(
    aes(xmin = ci_lower, xmax = ci_upper),
    position = position_dodge(.5), linewidth = .8, height = 0, alpha = 0.5
  ) +
  # White-filled marker drawn on top of both bars.
  geom_point(size = 2.8, fill = "white", stroke = 2) +
  # Reference line at a zero coefficient.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = .8) +
  # One panel per group; free y scales and spacing so each panel only shows
  # its own models.
  facet_grid(rows = vars(group), scales = "free_y", space = "free") +
  # Same marker shape for every model within a group.
  scale_shape_manual(
    values = c(
      "Cohort9" = 23, "Cohort0" = 23,
      "Female" = 21, "Male" = 21,
      "White" = 22, "Hispanic" = 22, "Black" = 22
    )
  ) +
  #scale_y_discrete(limits = d_demog$model) +
  scale_x_continuous(breaks = seq(-0.5, 0.1, 0.1)) +
  coord_cartesian(xlim = c(-0.55, 0.15)) +
  labs(x = "Coefficient", y = NULL) +
  theme_bw(base_size = 15) +
  theme(
    axis.text.y = element_text(size = 14),
    panel.grid = element_blank(),
    strip.background = element_blank(),
    strip.text = element_blank(),
    legend.position = "none"
  )

p2
ggsave("filepath/figure2.png", p2)
```



## FIGURE 3. HETEROGENEITY BY SES

```{r}
# Read the SES subgroup estimates, keep the rows whose `var` mentions
# arrestedbefore_19, drop every model whose name contains "Public", and build
# interval bounds from `coef` and `stderr`.
# Relies on z90 and z95 already existing in the session.
d_ses <- read_dta("filepath/heterogeneity_plot2.dta") %>%
  filter(
    grepl("arrestedbefore_19", var),
    !grepl("Public", model)
  ) %>%
  mutate(
    ci_90L = coef - stderr * z90,
    ci_90u = coef + stderr * z90,
    # The two 95% bounds should duplicate ci_lower / ci_upper from the file.
    ci_95l = coef - stderr * z95,
    ci_95u = coef + stderr * z95
  )
```


```{r, fig.width=7}
# Figure 3: one point per SES subgroup model at its coefficient, with a thin,
# lighter bar for ci_lower/ci_upper (presumably the 95% bounds supplied in the
# input file) and a thick bar for the 90% bounds drawn over it.

# Tag each model with its facet, then fix the facet order and the order of the
# models along the y axis.
d_ses <- d_ses %>%
  mutate(
    group = case_when(
      model %in% c("High Nhood SES", "Medium Nhood SES", "Low Nhood SES") ~ "Neighborhood",
      model %in% c("PC Graduate HS", "PC Dropout HS") ~ "PC HS",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    group = factor(group, levels = c("Neighborhood", "PC HS")),
    model = factor(
      model,
      levels = c("Low Nhood SES", "Medium Nhood SES", "High Nhood SES",
                 "PC Graduate HS", "PC Dropout HS")
    )
  )

p3 <- d_ses %>%
  ggplot(aes(y = model, x = coef, shape = model, xmin = ci_lower, xmax = ci_upper)) +
  # Thin bar: bounds carried in the input file.
  geom_errorbarh(
    aes(xmin = ci_lower, xmax = ci_upper),
    position = position_dodge(.5), linewidth = .8, height = 0, alpha = 0.5
  ) +
  # Thick bar: 90% bounds.
  geom_errorbarh(
    aes(xmin = ci_90L, xmax = ci_90u),
    position = position_dodge(.5), linewidth = 2, height = 0
  ) +
  # White-filled marker drawn on top of both bars.
  geom_point(size = 2.5, fill = "white", stroke = 2) +
  # Reference line at a zero coefficient.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = .8) +
  # One panel per group; free y scales and spacing so each panel only shows
  # its own models.
  facet_grid(rows = vars(group), scales = "free_y", space = "free") +
  # Same marker shape for every model within a group.
  scale_shape_manual(
    values = c(
      "PC Dropout HS" = 25, "PC Graduate HS" = 25,
      "Low Nhood SES" = 24, "Medium Nhood SES" = 24, "High Nhood SES" = 24
    )
  ) +
  #scale_y_discrete(limits = order2) +
  scale_x_continuous(breaks = seq(-0.5, 0.1, 0.1)) +
  coord_cartesian(xlim = c(-0.54, 0.13)) +
  labs(x = "Coefficient", y = NULL) +
  theme_bw(base_size = 15) +
  theme(
    axis.text.y = element_text(size = 14),
    panel.grid = element_blank(),
    strip.background = element_blank(),
    strip.text = element_blank(),
    legend.position = "none"
  )

p3
ggsave("filepath/figure3.png", p3)
```














# APPENDIX

## FIGURE A2. Demographics heterogeneity Firth


```{r}
# Read the Firth-model demographic subgroup estimates, keep the rows whose
# `var` mentions arrestedbefore_19, and build interval bounds from `coef` and
# `stderr`.
# Relies on z90 and z95 already existing in the session.
d_demogfirth <- read_dta("filepath/heterogeneity_plot_firth.dta") %>%
  filter(grepl("arrestedbefore_19", var)) %>%
  mutate(
    ci_90L = coef - stderr * z90,
    ci_90u = coef + stderr * z90,
    # The two 95% bounds should duplicate ci_lower / ci_upper from the file.
    ci_95l = coef - stderr * z95,
    ci_95u = coef + stderr * z95
  )
```



```{r, fig.width=7}
# Figure A2: same layout as the demographic coefficient plot, drawn from the
# Firth estimates. Thick bar = 90% bounds; thin, lighter bar =
# ci_lower/ci_upper (presumably the 95% bounds supplied in the input file).
# Unlike Figure 2, the x axis is left at its default breaks and limits.

# Tag each model with its facet, then fix the facet order and the order of the
# models along the y axis.
d_demogfirth <- d_demogfirth %>%
  mutate(
    group = case_when(
      model %in% c("Cohort9", "Cohort0") ~ "Cohort",
      model %in% c("Female", "Male") ~ "Gender",
      model %in% c("Black", "Hispanic", "White") ~ "Race",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    group = factor(group, levels = c("Race", "Gender", "Cohort")),
    model = factor(
      model,
      levels = c("White", "Hispanic", "Black", "Male", "Female", "Cohort9", "Cohort0")
    )
  )

pa2 <- d_demogfirth %>%
  ggplot(aes(y = model, x = coef, shape = model, xmin = ci_lower, xmax = ci_upper)) +
  # Thick bar: 90% bounds.
  geom_errorbarh(
    aes(xmin = ci_90L, xmax = ci_90u),
    position = position_dodge(.5), linewidth = 2, height = 0
  ) +
  # Thin bar: bounds carried in the input file.
  geom_errorbarh(
    aes(xmin = ci_lower, xmax = ci_upper),
    position = position_dodge(.5), linewidth = .8, height = 0, alpha = 0.5
  ) +
  # White-filled marker drawn on top of both bars.
  geom_point(size = 2.8, fill = "white", stroke = 2) +
  # Reference line at a zero coefficient.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = .8) +
  # One panel per group; free y scales and spacing so each panel only shows
  # its own models.
  facet_grid(rows = vars(group), scales = "free_y", space = "free") +
  # Same marker shape for every model within a group.
  scale_shape_manual(
    values = c(
      "Cohort9" = 23, "Cohort0" = 23,
      "Female" = 21, "Male" = 21,
      "White" = 22, "Hispanic" = 22, "Black" = 22
    )
  ) +
  labs(x = "Coefficient", y = NULL) +
  theme_bw(base_size = 15) +
  theme(
    axis.text.y = element_text(size = 14),
    panel.grid = element_blank(),
    strip.background = element_blank(),
    strip.text = element_blank(),
    legend.position = "none"
  )

pa2
ggsave("filepath/figurea2.png", pa2)
```



## FIGURE A3. SES heterogeneity Firth

```{r}
# Read the Firth-model SES subgroup estimates, keep the rows whose `var`
# mentions arrestedbefore_19, drop every model whose name contains "Public",
# and build interval bounds from `coef` and `stderr`.
# Relies on z90 and z95 already existing in the session.
d_sesfirth <- read_dta("filepath/heterogeneity_plot2_firth.dta") %>%
  filter(
    grepl("arrestedbefore_19", var),
    !grepl("Public", model)
  ) %>%
  mutate(
    ci_90L = coef - stderr * z90,
    ci_90u = coef + stderr * z90,
    # The two 95% bounds should duplicate ci_lower / ci_upper from the file.
    ci_95l = coef - stderr * z95,
    ci_95u = coef + stderr * z95
  )
```


```{r, fig.width=7}
# Figure A3: same layout as the SES coefficient plot, drawn from the Firth
# estimates. Thin, lighter bar = ci_lower/ci_upper (presumably the 95% bounds
# supplied in the input file); thick bar = 90% bounds drawn over it.
# Unlike Figure 3, the x axis is left at its default breaks and limits.

# Tag each model with its facet, then fix the facet order and the order of the
# models along the y axis.
d_sesfirth <- d_sesfirth %>%
  mutate(
    group = case_when(
      model %in% c("High Nhood SES", "Medium Nhood SES", "Low Nhood SES") ~ "Neighborhood",
      model %in% c("PC Graduate HS", "PC Dropout HS") ~ "PC HS",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    group = factor(group, levels = c("Neighborhood", "PC HS")),
    model = factor(
      model,
      levels = c("Low Nhood SES", "Medium Nhood SES", "High Nhood SES",
                 "PC Graduate HS", "PC Dropout HS")
    )
  )

pa3 <- d_sesfirth %>%
  ggplot(aes(y = model, x = coef, shape = model, xmin = ci_lower, xmax = ci_upper)) +
  # Thin bar: bounds carried in the input file.
  geom_errorbarh(
    aes(xmin = ci_lower, xmax = ci_upper),
    position = position_dodge(.5), linewidth = .8, height = 0, alpha = 0.5
  ) +
  # Thick bar: 90% bounds.
  geom_errorbarh(
    aes(xmin = ci_90L, xmax = ci_90u),
    position = position_dodge(.5), linewidth = 2, height = 0
  ) +
  # White-filled marker drawn on top of both bars.
  geom_point(size = 2.5, fill = "white", stroke = 2) +
  # Reference line at a zero coefficient.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = .8) +
  # One panel per group; free y scales and spacing so each panel only shows
  # its own models.
  facet_grid(rows = vars(group), scales = "free_y", space = "free") +
  # Same marker shape for every model within a group.
  scale_shape_manual(
    values = c(
      "PC Dropout HS" = 25, "PC Graduate HS" = 25,
      "Low Nhood SES" = 24, "Medium Nhood SES" = 24, "High Nhood SES" = 24
    )
  ) +
  labs(x = "Coefficient", y = NULL) +
  theme_bw(base_size = 15) +
  theme(
    axis.text.y = element_text(size = 14),
    panel.grid = element_blank(),
    strip.background = element_blank(),
    strip.text = element_blank(),
    legend.position = "none"
  )

pa3
ggsave("filepath/figurea3.png", pa3)
```






## FIGURE A4. Age cutoff sensitivity

```{r}
# Top-to-bottom order... of the age-cutoff models is set by this vector, which
# is passed as the y-axis limits when the figure is drawn.
orderage <- c("Age20", "Age19", "Age18")

# Read the age-cutoff estimates and keep only the rows whose `var` mentions
# arrestedbefore.
d_age <- read_dta("filepath/age_cutoff_figure.dta") %>%
  filter(grepl("arrestedbefore", var))
```

```{r, fig.width=7}
# Figure A4: coefficient and ci_lower/ci_upper interval for each age-cutoff
# model, one row per model.
pa4 <- d_age %>%
  ggplot(aes(y = model, x = coef, xmin = ci_lower, xmax = ci_upper)) +
  # Interval bar without end caps.
  geom_errorbar(width = 0, linewidth = 1.1, alpha = 0.9) +
  geom_point(size = 3.5) +
  # Reference line at a zero coefficient.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = .5) +
  # Labels are matched to `orderage` by position.
  scale_y_discrete(
    limits = orderage,
    labels = c("Age 20", "Age 19", "Age 18")
  ) +
  labs(x = "Coefficient", y = "") +
  theme_bw(base_size = 14) +
  theme(
    panel.grid = element_blank(),
    plot.title = element_text(size = 15),
    axis.text.y = element_text(size = 13),
    axis.text.x = element_text(size = 12),
    legend.position = "none"
  )

print(pa4)
ggsave("filepath/figurea4.png", pa4)
```

